{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "from __future__ import division"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import pickle\n",
    "import unicodedata\n",
    "import time\n",
    "import sklearn\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.multiclass import OneVsRestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "issues = pickle.load(open(\"subset_issue.pkl\"))\n",
    "comment_text = pickle.load(open(\"comment_text.pkl\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "table for removing punctuation from text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "table = dict.fromkeys(i for i in xrange(sys.maxunicode)\n",
    "                      if unicodedata.category(unichr(i)).startswith('P'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Clean The text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_text_components_per_issue(issues):\n",
    "    text_per_issue = []\n",
    "    components_per_issue = []\n",
    "\n",
    "    for index, row in issues.iterrows():\n",
    "        issue_text = \"\"\n",
    "        for comment_id in row[\"comments\"]:\n",
    "            text = comment_text[comment_id].strip()\n",
    "            # Remove punctuation\n",
    "            text = text.translate(table)\n",
    "            issue_text += text + \" \"\n",
    "        text_per_issue.append(issue_text.strip())\n",
    "\n",
    "        components_per_issue.append(set(row[\"components\"]))\n",
    "    \n",
    "    return text_per_issue, components_per_issue\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "text_per_issue, components_per_issue = get_text_components_per_issue(issues)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Filter out components that are used infrequently(not enough singal) or too frequently (signal not meaningful)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def prune_and_bin_components(components_per_issue, prune_low=0.005, prune_high=0.25):\n",
    "    mlb = MultiLabelBinarizer()\n",
    "    bins = mlb.fit_transform(components_per_issue)\n",
    "    exclude_comp_ids = set(mlb.classes_[~(((bins.sum(axis=0) / bins.sum()) > prune_low) & \n",
    "                                        ((bins.sum(axis=0) / bins.sum()) < prune_high))])\n",
    "    \n",
    "    comps_per_issue_exclude = []\n",
    "    for comp_set in components_per_issue:\n",
    "        comps = comp_set - exclude_comp_ids\n",
    "        comps_per_issue_exclude.append(comps)\n",
    "    \n",
    "    mlb = MultiLabelBinarizer()\n",
    "    bins = mlb.fit_transform(comps_per_issue_exclude)\n",
    "    return bins, mlb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "bins, mlb = prune_and_bin_components(components_per_issue)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenize the text and perform tfidf transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),\n",
    "                                    token_pattern=r'\\b\\w+\\b',\n",
    "                                    min_df=5,\n",
    "                                    max_df=0.5,\n",
    "                                    stop_words='english')\n",
    "\n",
    "tfidf_transformer =  TfidfTransformer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "counts = bigram_vectorizer.fit_transform(text_per_issue)\n",
    "tfidf = tfidf_transformer.fit_transform(counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(tfidf, bins, train_size=0.8, random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train a very simple linear model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clf = OneVsRestClassifier(LinearSVC(C=1.0))\n",
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Predict and analyze the results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "predictions = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "(y_test == predictions).sum() / (y_test.shape[0] * y_test.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "np.sum((y_test == predictions).sum(axis=1) == 44) / y_test.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sns.distplot(y_test.sum(axis=1), kde=False)\n",
    "sns.distplot(predictions.sum(axis=1), kde=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sns.barplot(range(44), y_test.sum(axis=0), color=\"red\")\n",
    "sns.barplot(range(44), predictions.sum(axis=0), color=\"blue\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Serialize the data and the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def serialize_data_model(vectorizer, classifier, features, targets, transformer=None):\n",
    "    current_time = int(time.time())\n",
    "    pickle.dump(vectorizer, open(\"{}-vectorizer.pkl\".format(current_time), \"wb\"))\n",
    "    pickle.dump(classifier, open(\"{}-classifier.pkl\".format(current_time), \"wb\"))\n",
    "    \n",
    "    training = {\"features\": features, \"targets\": targets}\n",
    "    pickle.dump(training, open(\"{}.pkl\".format(current_time), \"wb\"))\n",
    "    \n",
    "    if transformer:\n",
    "        pickle.dump(transformer, open(\"{}-transformer.pkl\".format(current_time), \"wb\"))\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "serialize_data_model(bigram_vectorizer, clf, tfidf, bins, tfidf_transformer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
